Import the data sets extracted from the UCPH_Data Preparation R Markdown.
list.files()
## [1] "endpoint.txt" "plant_info.txt" "S_timeseries.txt"
## [4] "T_timeseries.txt" "testtemplate" "timeseries.txt"
## [7] "UCPH_DataAnalysis.html" "UCPH_DataAnalysis.Rmd"
plant_info <- read.table("plant_info.txt", header = TRUE, sep = "\t")
endpoint <- read.table("endpoint.txt", header = TRUE, sep = "\t")
S_timeseries <- read.table("S_timeseries.txt", header = TRUE, sep = "\t")
We must convert the columns to factor and date formats.
# plant_info
plant_info <- lapply(plant_info, factor)
# endpoint
matching_cols <- intersect(names(endpoint), names(plant_info))
endpoint[, matching_cols] <- lapply(endpoint[, matching_cols], factor)
endpoint$Date <- date(endpoint$Date)
endpoint$Timestamp <- NA
# timeseries
# No data for UCPH
# S_timeseries
matching_cols <- intersect(names(S_timeseries), names(plant_info))
S_timeseries[, matching_cols] <- lapply(S_timeseries[, matching_cols], factor)
S_timeseries$Timestamp <- as.POSIXct(S_timeseries$Timestamp, format = "%Y-%m-%d %H:%M:%S")
S_timeseries$Date <- date(S_timeseries$Date)
# T_timeseries
# No data for UCPH
This part extracts the variables in the endpoint dataframe.
df <- endpoint[,colSums(is.na(endpoint))<nrow(endpoint)]
genotype_index <- which(colnames(df) == "Genotype")
variables <- colnames(df[, c(3:(genotype_index - 1))]) # We remove the three first columns that are "Unit.ID","Time" and "Date"
variables
## [1] "DW_shoot_g" "DW_root_g"
The variables for NaPPI are “DW_shoot_g” and “FW_shoot_g”
get_summary_stats(data = endpoint,
variables[1], variables[2],
type = "common")
## # A tibble: 2 × 10
## variable n min max median iqr mean sd se ci
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 DW_shoot_g 107 0.768 8 3.22 2.00 3.46 1.65 0.16 0.317
## 2 DW_root_g 108 0.1 5 0.8 0.7 0.982 0.832 0.08 0.159
endpoint %>%
count(Genotype)
## Genotype n
## 1 EPPN_T 12
## 2 EPPN1_H 12
## 3 EPPN1_L 12
## 4 EPPN2_H 12
## 5 EPPN2_L 12
## 6 EPPN3_H 12
## 7 EPPN3_L 12
## 8 EPPN4_H 12
## 9 EPPN4_L 12
skim(endpoint[, unlist(variables)])
| Name | endpoint[, unlist(variabl… |
| Number of rows | 108 |
| Number of columns | 2 |
| _______________________ | |
| Column type frequency: | |
| numeric | 2 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| DW_shoot_g | 1 | 0.99 | 3.46 | 1.65 | 0.77 | 2.3 | 3.22 | 4.3 | 8 | ▅▇▅▂▁ |
| DW_root_g | 0 | 1.00 | 0.98 | 0.83 | 0.10 | 0.5 | 0.80 | 1.2 | 5 | ▇▂▁▁▁ |
## Boxplots
# We add a variable PlantType
endpoint$Plant_type <- substr(endpoint$Genotype, nchar(as.character(endpoint$Genotype)), nchar(as.character(endpoint$Genotype)))
boxplot1 <- ggplot(endpoint, aes(y = endpoint[ , colnames(endpoint)==variables[1]]))+
geom_boxplot(aes(x = Genotype, fill = Plant_type))+
labs(title = variables[1], y = variables[1])
boxplot2 <- ggplot(endpoint, aes(y = endpoint[ , colnames(endpoint)==variables[2]]))+
geom_boxplot(aes(x = Genotype, fill = Plant_type))+
labs(title = variables[2], y = variables[2])
combined <- boxplot1 + boxplot2 & theme(legend.position = "bottom")
combined + plot_layout(guides = "collect")
## Warning: Removed 1 rows containing non-finite values (`stat_boxplot()`).
Remove the outliers, replacing them with NULL values and normality verification of residuals.
# Create a clean dataset that will contain the data without the outlyers
endpoint_clean <- endpoint
## Outlayer detection function
isoutlayer <- function(independent_variable, genotype, proba = 0.05) {
data <- data.frame(genotype, independent_variable)
data <- na.omit(data)
data$genotype <- as.factor(data$genotype)
# Linear model fit
fit <- lm(independent_variable ~ genotype, data)
# Density histogram of residuals - Normality test
density_histogram <- ggplot(data = data.frame(x = fit$residuals), aes(x = x)) +
geom_histogram(aes(y = ..density..), bins = 20, color = 4, fill = 4, alpha = 0.5) +
labs(title = "Density histogram of residuals", x = "Residuals", y = "Density")+
geom_density(color = "blue", size = 0.7, linetype = "dashed")+
stat_function(fun = dnorm, args = list(mean = mean(fit$residuals), sd = sd(fit$residuals)), color = "red", size = 0.7)
# Boxplot of residuals
boxplot <- ggplot(data = data.frame(x = fit$residuals, genotype = data$genotype), aes(x = genotype, y = x, fill = genotype))+
geom_boxplot() +
labs(title = "Boxplot of residuals", y = deparse(substitute(independent_variable)))
# qqPlot of residuals
qqplot <- qqPlot(fit$residuals)
# Show graphs
grid.arrange(density_histogram, boxplot, ncol = 2)
print(qqplot)
# Return logical vector for the outlayers values
return(abs(fit$residuals) > 2.0 * summary(fit)$sigma)
}
# Run the function on the dataset for all the variables
endpoint_clean[[variables[1]]][isoutlayer(endpoint[[variables[1]]], endpoint$Genotype)]
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## 31 101
## 31 100
## [1] 7.804054 5.242991 5.769168 1.924528
endpoint_clean[[variables[2]]][isoutlayer(endpoint[[variables[2]]], endpoint$Genotype)]
## [1] 55 77
## [1] 4.3 5.0 3.5 4.0
ATTENTION ICI CHANGER LES NOMS DES VARIABLES
# Violin and sina plots
violin1 <- ggplot(endpoint_clean, aes(y = DW_shoot_g, x = Genotype))+
geom_violin(aes(fill = Plant_type), color = "white", alpha = 0.4)+
geom_sina(size=1, aes(color = Plant_type))+
theme(legend.position = "none")+
labs(title = variables[1])+
theme(axis.text.x = element_text(angle = 45, hjust = 1), # Rotation des étiquettes des dates
panel.grid.major.x = element_line(color = "lightgray", size = 0.5), # Paramètres de la grille
panel.grid.minor.x = element_blank()) # Supprimer les lignes de grille mineures
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
violin2 <- ggplot(endpoint_clean, aes(y = DW_root_g, x = Genotype))+
geom_violin(aes(fill = Plant_type), color = "white", alpha = 0.4)+
geom_sina(size=1, aes(color = Plant_type))+
theme(legend.position = "none")+
labs(title = variables[2])+
theme(axis.text.x = element_text(angle = 45, hjust = 1), # Rotation des étiquettes des dates
panel.grid.major.x = element_line(color = "lightgray", size = 0.5), # Paramètres de la grille
panel.grid.minor.x = element_blank()) # Supprimer les lignes de grille mineures
combined <- violin1 + violin2 & theme(legend.position = "bottom")
combined + plot_layout(guides = "collect")
## Warning: Removed 1 rows containing non-finite values (`stat_ydensity()`).
## Warning: Removed 1 rows containing non-finite values (`stat_sina()`).
ATTENTION ICI CHANGER LES NOMS DES VARIABLES
skim(endpoint_clean[, unlist(variables)])
| Name | endpoint_clean[, unlist(v… |
| Number of rows | 108 |
| Number of columns | 2 |
| _______________________ | |
| Column type frequency: | |
| numeric | 2 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| DW_shoot_g | 1 | 0.99 | 3.46 | 1.65 | 0.77 | 2.3 | 3.22 | 4.3 | 8 | ▅▇▅▂▁ |
| DW_root_g | 0 | 1.00 | 0.98 | 0.83 | 0.10 | 0.5 | 0.80 | 1.2 | 5 | ▇▂▁▁▁ |
# For "DW_shoot_g"
endpoint_clean %>%
group_by(Genotype) %>%
summarize(mean = mean(DW_shoot_g, na.rm=TRUE),
std.dev = sd(DW_shoot_g, na.rm=TRUE),
n_missing = sum(is.na(DW_shoot_g))) %>%
arrange(desc(mean)) %>% # sort
print(n=Inf) # print full table
## # A tibble: 9 × 4
## Genotype mean std.dev n_missing
## <fct> <dbl> <dbl> <int>
## 1 EPPN4_L 3.95 1.56 1
## 2 EPPN1_H 3.92 1.81 0
## 3 EPPN2_L 3.71 1.87 0
## 4 EPPN1_L 3.48 1.66 0
## 5 EPPN3_H 3.45 2.16 0
## 6 EPPN3_L 3.38 1.10 0
## 7 EPPN4_H 3.31 2.03 0
## 8 EPPN_T 3.15 1.43 0
## 9 EPPN2_H 2.81 1.18 0
# For "DW_root_g"
endpoint_clean %>%
group_by(Genotype) %>%
summarize(mean = mean(DW_root_g, na.rm=TRUE),
std.dev = sd(DW_root_g, na.rm=TRUE),
n_missing = sum(is.na(DW_root_g))) %>%
arrange(desc(mean)) %>% # sort
print(n=Inf) # print full table
## # A tibble: 9 × 4
## Genotype mean std.dev n_missing
## <fct> <dbl> <dbl> <int>
## 1 EPPN1_H 1.49 1.34 0
## 2 EPPN_T 1.07 1.29 0
## 3 EPPN3_H 1.05 1.07 0
## 4 EPPN2_L 0.983 0.595 0
## 5 EPPN1_L 0.975 0.567 0
## 6 EPPN4_L 0.9 0.533 0
## 7 EPPN3_L 0.892 0.378 0
## 8 EPPN4_H 0.817 0.629 0
## 9 EPPN2_H 0.667 0.438 0
La variable explicative(X) sera le génotype, variable catégorielle. Les réponses(Y) sont les données phénotypiques (dans ce cas-ci la FW_shoot_g et la Measured_plant_height_cm)
unique(endpoint$Genotype)
## [1] EPPN2_H EPPN_T EPPN1_H EPPN1_L EPPN3_L EPPN4_H EPPN2_L EPPN3_H EPPN4_L
## 9 Levels: EPPN_T EPPN1_H EPPN1_L EPPN2_H EPPN2_L EPPN3_H EPPN3_L ... EPPN4_L
variables
## [1] "DW_shoot_g" "DW_root_g"
ATTENTION ICI CHANGER LES VARIABLES ### 1. First linear models Firstly, we model the Y = X + r + c + e Where - Y is the phenotypic trait; - X the genotype; - r the row effect (fixed or random); - c the column effect (fixed or random);
Models for DW_shoot_g and FW_shoot_g with fixed or random effects of Row and Column.
# Linear model of "DW_shoot_g" with Rows and Columns as fixed effects
mod1 <- lm(DW_shoot_g ~ Genotype + Row + Column, endpoint_clean)
summary(mod1)
##
## Call:
## lm(formula = DW_shoot_g ~ Genotype + Row + Column, data = endpoint_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.1929 -0.9016 -0.1632 0.9502 3.1679
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.22535 0.83657 2.660 0.00946 **
## GenotypeEPPN1_H 0.64568 0.66386 0.973 0.33372
## GenotypeEPPN1_L 0.34784 0.66861 0.520 0.60435
## GenotypeEPPN2_H -0.40655 0.66618 -0.610 0.54344
## GenotypeEPPN2_L 0.63701 0.66634 0.956 0.34200
## GenotypeEPPN3_H 0.21468 0.66620 0.322 0.74812
## GenotypeEPPN3_L 0.37039 0.66623 0.556 0.57982
## GenotypeEPPN4_H 0.16653 0.66873 0.249 0.80399
## GenotypeEPPN4_L 0.79085 0.68435 1.156 0.25131
## Row2 0.54325 0.76386 0.711 0.47906
## Row3 0.09322 0.79172 0.118 0.90657
## Row4 0.21972 0.76386 0.288 0.77437
## Row5 0.77570 0.76386 1.015 0.31297
## Row6 0.39310 0.76386 0.515 0.60826
## Row7 0.88715 0.76386 1.161 0.24897
## Row8 -0.30547 0.76386 -0.400 0.69031
## Row9 -0.82113 0.76386 -1.075 0.28566
## Row10 0.55524 0.76386 0.727 0.46944
## Row11 -0.23882 0.76386 -0.313 0.75537
## Row12 0.38861 0.76386 0.509 0.61234
## Column2 0.03488 0.66870 0.052 0.95853
## Column3 1.68226 0.66623 2.525 0.01357 *
## Column4 0.55525 0.66859 0.830 0.40877
## Column5 1.34309 0.68180 1.970 0.05235 .
## Column6 0.57524 0.66621 0.863 0.39051
## Column7 1.33187 0.66874 1.992 0.04987 *
## Column8 -0.22411 0.66385 -0.338 0.73656
## Column9 1.24077 0.66623 1.862 0.06627 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.62 on 79 degrees of freedom
## (1 observation effacée parce que manquante)
## Multiple R-squared: 0.2838, Adjusted R-squared: 0.03899
## F-statistic: 1.159 on 27 and 79 DF, p-value: 0.3003
anova(mod1)
## Analysis of Variance Table
##
## Response: DW_shoot_g
## Df Sum Sq Mean Sq F value Pr(>F)
## Genotype 8 12.508 1.5635 0.5955 0.7789
## Row 11 24.134 2.1940 0.8356 0.6053
## Column 8 45.544 5.6930 2.1682 0.0388 *
## Residuals 79 207.427 2.6257
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Linear model of "DW_shoot_g" with Rows and Columns as random effects
mod2 <- lmer(DW_shoot_g ~ Genotype + (1|Row) + (1|Column), endpoint_clean)
## boundary (singular) fit: see help('isSingular')
summary(mod2)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: DW_shoot_g ~ Genotype + (1 | Row) + (1 | Column)
## Data: endpoint_clean
##
## REML criterion at convergence: 399.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.8027 -0.7516 -0.1208 0.6252 2.4692
##
## Random effects:
## Groups Name Variance Std.Dev.
## Row (Intercept) 0.0000 0.0000
## Column (Intercept) 0.2687 0.5184
## Residual 2.5706 1.6033
## Number of obs: 107, groups: Row, 12; Column, 9
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 3.1533 0.4957 76.6159 6.361 1.33e-08 ***
## GenotypeEPPN1_H 0.7040 0.6558 90.5805 1.073 0.286
## GenotypeEPPN1_L 0.3415 0.6584 91.6671 0.519 0.605
## GenotypeEPPN2_H -0.3729 0.6571 91.1193 -0.568 0.572
## GenotypeEPPN2_L 0.6035 0.6571 91.1568 0.918 0.361
## GenotypeEPPN3_H 0.2566 0.6571 91.1237 0.391 0.697
## GenotypeEPPN3_L 0.3077 0.6571 91.1321 0.468 0.641
## GenotypeEPPN4_H 0.1642 0.6584 91.6951 0.249 0.804
## GenotypeEPPN4_L 0.8041 0.6726 91.4143 1.196 0.235
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) GEPPN1_H GEPPN1_L GEPPN2_H GEPPN2_L GEPPN3_H GEPPN3_L
## GntyEPPN1_H -0.661
## GntyEPPN1_L -0.664 0.498
## GntyEPPN2_H -0.663 0.499 0.501
## GntyEPPN2_L -0.663 0.497 0.499 0.500
## GntyEPPN3_H -0.663 0.501 0.501 0.500 0.498
## GntyEPPN3_L -0.663 0.497 0.501 0.500 0.502 0.498
## GntyEPPN4_H -0.664 0.498 0.504 0.499 0.501 0.499 0.501
## GntyEPPN4_L -0.649 0.490 0.492 0.488 0.488 0.491 0.488
## GEPPN4_H
## GntyEPPN1_H
## GntyEPPN1_L
## GntyEPPN2_H
## GntyEPPN2_L
## GntyEPPN3_H
## GntyEPPN3_L
## GntyEPPN4_H
## GntyEPPN4_L 0.492
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')
ranova(mod2)
## boundary (singular) fit: see help('isSingular')
## ANOVA-like table for random-effects: Single term deletions
##
## Model:
## DW_shoot_g ~ Genotype + (1 | Row) + (1 | Column)
## npar logLik AIC LRT Df Pr(>Chisq)
## <none> 12 -199.65 423.31
## (1 | Row) 11 -199.65 421.31 0.0000 1 1.00000
## (1 | Column) 11 -201.13 424.25 2.9479 1 0.08599 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Linear model of "DW_root_g" with Rows and Columns as random effects
mod3 <- lm(DW_root_g ~ Genotype + Row + Column, endpoint_clean)
summary(mod3)
##
## Call:
## lm(formula = DW_root_g ~ Genotype + Row + Column, data = endpoint_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.4165 -0.4383 -0.1346 0.3884 3.1027
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.53168 0.42684 1.246 0.2165
## GenotypeEPPN1_H 0.37533 0.33889 1.108 0.2714
## GenotypeEPPN1_L -0.08119 0.34131 -0.238 0.8126
## GenotypeEPPN2_H -0.38209 0.34007 -1.124 0.2646
## GenotypeEPPN2_L -0.06935 0.34010 -0.204 0.8389
## GenotypeEPPN3_H -0.03992 0.34008 -0.117 0.9069
## GenotypeEPPN3_L -0.15029 0.34010 -0.442 0.6598
## GenotypeEPPN4_H -0.27947 0.34133 -0.819 0.4154
## GenotypeEPPN4_L -0.20697 0.34130 -0.606 0.5460
## Row2 0.31111 0.38994 0.798 0.4273
## Row3 0.16667 0.38994 0.427 0.6702
## Row4 -0.05556 0.38994 -0.142 0.8871
## Row5 0.46667 0.38994 1.197 0.2349
## Row6 0.14444 0.38994 0.370 0.7120
## Row7 0.64444 0.38994 1.653 0.1023
## Row8 0.01111 0.38994 0.028 0.9773
## Row9 -0.05556 0.38994 -0.142 0.8871
## Row10 0.41111 0.38994 1.054 0.2949
## Row11 -0.25556 0.38994 -0.655 0.5141
## Row12 0.26667 0.38994 0.684 0.4960
## Column2 0.05549 0.34130 0.163 0.8713
## Column3 0.55875 0.34010 1.643 0.1043
## Column4 0.33024 0.34130 0.968 0.3362
## Column5 0.72117 0.34010 2.120 0.0371 *
## Column6 0.59243 0.34009 1.742 0.0854 .
## Column7 0.72249 0.34131 2.117 0.0374 *
## Column8 0.12649 0.33888 0.373 0.7099
## Column9 0.24179 0.34010 0.711 0.4792
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8272 on 80 degrees of freedom
## Multiple R-squared: 0.2605, Adjusted R-squared: 0.01086
## F-statistic: 1.044 on 27 and 80 DF, p-value: 0.4257
anova(mod3)
## Analysis of Variance Table
##
## Response: DW_root_g
## Df Sum Sq Mean Sq F value Pr(>F)
## Genotype 8 4.959 0.61988 0.9060 0.5159
## Row 11 6.643 0.60393 0.8826 0.5605
## Column 8 7.676 0.95949 1.4023 0.2084
## Residuals 80 54.738 0.68423
# Linear model of "DW_root_g" with Rows and Columns as random effects
mod4 <- lmer(DW_root_g ~ Genotype + (1|Row) + (1|Column), endpoint_clean)
## boundary (singular) fit: see help('isSingular')
summary(mod4)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: DW_root_g ~ Genotype + (1 | Row) + (1 | Column)
## Data: endpoint_clean
##
## REML criterion at convergence: 267.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.5625 -0.5655 -0.2342 0.3342 4.6592
##
## Random effects:
## Groups Name Variance Std.Dev.
## Row (Intercept) 1.704e-15 4.128e-08
## Column (Intercept) 2.436e-02 1.561e-01
## Residual 6.743e-01 8.211e-01
## Number of obs: 108, groups: Row, 12; Column, 9
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 1.06919 0.24317 90.91275 4.397 2.97e-05 ***
## GenotypeEPPN1_H 0.41021 0.33558 91.94626 1.222 0.225
## GenotypeEPPN1_L -0.08852 0.33629 93.57954 -0.263 0.793
## GenotypeEPPN2_H -0.39473 0.33593 92.77403 -1.175 0.243
## GenotypeEPPN2_L -0.07926 0.33593 92.78438 -0.236 0.814
## GenotypeEPPN3_H -0.02353 0.33593 92.77748 -0.070 0.944
## GenotypeEPPN3_L -0.16777 0.33593 92.78441 -0.499 0.619
## GenotypeEPPN4_H -0.25885 0.33629 93.58608 -0.770 0.443
## GenotypeEPPN4_L -0.17861 0.33629 93.57631 -0.531 0.597
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) GEPPN1_H GEPPN1_L GEPPN2_H GEPPN2_L GEPPN3_H GEPPN3_L
## GntyEPPN1_H -0.690
## GntyEPPN1_L -0.691 0.499
## GntyEPPN2_H -0.691 0.499 0.501
## GntyEPPN2_L -0.691 0.498 0.499 0.500
## GntyEPPN3_H -0.691 0.501 0.501 0.500 0.499
## GntyEPPN3_L -0.691 0.498 0.501 0.500 0.501 0.499
## GntyEPPN4_H -0.691 0.499 0.502 0.499 0.501 0.499 0.501
## GntyEPPN4_L -0.691 0.500 0.501 0.499 0.501 0.501 0.499
## GEPPN4_H
## GntyEPPN1_H
## GntyEPPN1_L
## GntyEPPN2_H
## GntyEPPN2_L
## GntyEPPN3_H
## GntyEPPN3_L
## GntyEPPN4_H
## GntyEPPN4_L 0.502
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')
anova(mod4)
## Type III Analysis of Variance Table with Satterthwaite's method
## Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
## Genotype 4.7479 0.59349 8 92.974 0.8802 0.5362
ranova(mod4)
## boundary (singular) fit: see help('isSingular')
## ANOVA-like table for random-effects: Single term deletions
##
## Model:
## DW_root_g ~ Genotype + (1 | Row) + (1 | Column)
## npar logLik AIC LRT Df Pr(>Chisq)
## <none> 12 -133.57 291.14
## (1 | Row) 11 -133.57 289.14 0.00000 1 1.0000
## (1 | Column) 11 -133.83 289.66 0.51949 1 0.4711
# Linear model of Soil?
#mod3 <- lm(Soil ~ Genotype + (1|Row) + (1|Column), endpoint_clean)
#summary(mod3)
#anova(mod3)
Model with X as Plant_type instead of Genotype, and row and column effects as random effects. Plant_type is defined as H for Hybrid, L for pure Line and T for Tester.
# Linear model of DW_shoot_g with Rows and Columns as random effects
endpoint_clean$Plant_type <- as.factor(endpoint_clean$Plant_type)
# Rajouter colonne avec le "numéro" de génotype
endpoint_clean$Plant_type <- relevel(endpoint_clean$Plant_type, ref = "T") # T as base level
mod <- lmer(DW_shoot_g ~ Plant_type + (1|Row) + (1|Column), endpoint_clean)
## boundary (singular) fit: see help('isSingular')
summary(mod)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: DW_shoot_g ~ Plant_type + (1 | Row) + (1 | Column)
## Data: endpoint_clean
##
## REML criterion at convergence: 407.4
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.7872 -0.7322 -0.1474 0.5911 2.5756
##
## Random effects:
## Groups Name Variance Std.Dev.
## Row (Intercept) 0.0000 0.000
## Column (Intercept) 0.2883 0.537
## Residual 2.4959 1.580
## Number of obs: 107, groups: Row, 12; Column, 9
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 3.1516 0.4917 76.9543 6.410 1.06e-08 ***
## Plant_typeH 0.1873 0.5118 96.9742 0.366 0.715
## Plant_typeL 0.5131 0.5136 97.3840 0.999 0.320
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) Plnt_H
## Plant_typeH -0.833
## Plant_typeL -0.832 0.798
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')
modasreml <- asreml(fixed = DW_shoot_g ~ Genotype,
random = ~ Row + Column,
residual = ~ NULL,
data = endpoint_clean)
## ASReml Version 4.2 24/05/2024 09:10:35
## LogLik Sigma2 DF wall
## 1 -110.8407 2.459644 98 09:10:35
## 2 -109.7897 2.537136 98 09:10:35 ( 1 restrained)
## 3 -109.6061 2.567283 98 09:10:35 ( 1 restrained)
## 4 -109.5973 2.570370 98 09:10:35 ( 1 restrained)
## 5 -109.5968 2.570553 98 09:10:35 ( 1 restrained)
plot(modasreml)
summary(modasreml)$varcomp
## component std.error z.ratio bound %ch
## Column 2.687275e-01 0.2464866 1.090232 P 0
## Row 8.472656e-07 NA NA B NA
## units!R 2.570553e+00 0.4046865 6.351961 P 0
PROBLEME DANS CE BLOC
Model with Soil as explicative variable.
PROBLEME DANS CE BLOC
PROBLEME DANS CE BLOC PCA, clustering, etc, voir p.56 biométrie 1
In this part, we look at the timeseries, S_timeseries and T_timeseries datasets.
REPLACER DANS LE CODE APRES
h1 <- ggplot(timeseries, aes(x = Date)) + geom_bar(aes(fill = Genotype), position = “stack”, width = 1) + scale_fill_viridis_d(option = “D”) + labs(x = “Date”, y = “Number of observations”, title = “Observations per day for timeseries_shoot_and_plant”) + scale_y_continuous(breaks = seq(from = 0, to = 325, by = 25)) + scale_x_date(date_breaks = “2 days”, date_labels = “%d-%m-%Y”) + # Exemple de format de date (%d-%m-%Y) theme(axis.text.x = element_text(angle = 45, hjust = 1), # Rotation des étiquettes des dates panel.grid.major.x = element_line(color = “lightgray”, size = 0.5), # Paramètres de la grille panel.grid.minor.x = element_blank()) # Supprimer les lignes de grille mineures
h3 <- ggplot(T_timeseries, aes(x = Date)) + geom_bar(aes(fill = Genotype), position = “stack”, width = 1) + scale_fill_viridis_d(option = “D”) + labs(x = “Date”, y = “Number of observations”, title = “Observations per day for T_timeseries_shoot”) + scale_y_continuous(breaks = seq(from = 0, to = 325, by = 25)) + scale_x_date(date_breaks = “2 days”, date_labels = “%d-%m-%Y”) + # Exemple de format de date (%d-%m-%Y) theme(axis.text.x = element_text(angle = 45, hjust = 1), # Rotation des étiquettes des dates panel.grid.major.x = element_line(color = “lightgray”, size = 0.5), # Paramètres de la grille panel.grid.minor.x = element_blank()) # Supprimer les lignes de grille mineures
combined <- h1 + h2 + h3 & theme(legend.position = “top”) combined + plot_layout(guides = “collect”)
h2 <- ggplot(S_timeseries, aes(x = Date)) +
geom_bar(aes(fill = Genotype), position = "stack", width = 0.96) +
scale_fill_viridis_d(option = "D") +
labs(x = "Date", y = "Number of observations", title = "Observations per day for S_timeseries") +
scale_y_continuous(breaks = seq(from = 0, to = 325, by = 25)) +
scale_x_date(date_breaks = "2 days", date_labels = "%d-%m-%Y") + # Exemple de format de date (%d-%m-%Y)
theme(axis.text.x = element_text(angle = 45, hjust = 1), # Rotation des étiquettes des dates
panel.grid.major.x = element_line(color = "lightgray", size = 0.5), # Paramètres de la grille
panel.grid.minor.x = element_blank()) # Supprimer les lignes de grille mineures
h2
Firsty, we extract the variables of the timeseries dataframe.
variables_1 <- "S_Height_cm"
timePoint_endpoint <- createTimePoints(dat = endpoint,
experimentName = "EPPN2020_UCPH",
genotype = "Genotype",
timePoint = "Date",
plotId = "Unit.ID",
rowNum = "Row",
colNum = "Column")
summary(timePoint_endpoint)
## timePoint_endpoint contains data for experiment EPPN2020_UCPH.
##
## It contains 1 time points.
## First time point: 2021-06-28
## Last time point: 2021-06-28
##
## No check genotypes are defined.
getTimePoints(timePoint_endpoint)
## timeNumber timePoint
## 1 1 2021-06-28
Count the number of observations per trait.
traits <- variables
for (trait_name in traits) {
print(paste("How many observations for", trait_name))
num_observations <- countValid(timePoint_endpoint, trait_name)
print(num_observations)
}
## [1] "How many observations for DW_shoot_g"
## 2021-06-28
## 107
## [1] "How many observations for DW_root_g"
## 2021-06-28
## 108
Check the layout at the only timepoint.
getTimePoints(timePoint_endpoint)
## timeNumber timePoint
## 1 1 2021-06-28
num_timepoints <- getTimePoints(timePoint_endpoint)[1,1]
plot(timePoint_endpoint,
plotType = "layout",
timePoints = num_timepoints,
highlight = c("EPPN_T", "EPPN1_L", "EPPN1_H", "EPPN2_L", "EPPN2_H", "EPPN3_L", "EPPN3_H", "EPPN4_L", "EPPN4_H"))
Check the heatmap of the raw data at harvest.
for (trait_name in traits) {
plot(timePoint_endpoint,
plotType = "layout",
timePoints = num_timepoints,
traits = trait_name)
}
timePoint_endpoint_clean <- createTimePoints(dat = endpoint_clean,
experimentName = "EPPN2020_NaPPI",
genotype = "Genotype",
timePoint = "Date",
plotId = "Unit.ID",
rowNum = "Row",
colNum = "Column")
Count the number of observations per trait
for (trait_name in traits) {
print(paste("How many observations for", trait_name))
num_observations <- countValid(timePoint_endpoint_clean, trait_name)
print(num_observations)
}
## [1] "How many observations for DW_shoot_g"
## 2021-06-28
## 107
## [1] "How many observations for DW_root_g"
## 2021-06-28
## 108
Check the heatmap of the data at harvest.
for (trait_name in traits) {
plot(timePoint_endpoint_clean,
plotType = "layout",
timePoints = num_timepoints,
traits = trait_name)
}
For the createTimePoints we need one timepoint per Unit.ID per day. We can average the variables per day.
# Agréger les données
S_timeseries <- S_timeseries %>%
group_by(Unit.ID, Date) %>%
mutate(S_Height_cm_avg = mean(S_Height_cm)) %>%
distinct(Unit.ID, Date, .keep_all = TRUE) %>%
ungroup()
# We add a Plant_type variable
S_timeseries$Plant_type <- substr(S_timeseries$Genotype, nchar(as.character(S_timeseries$Genotype)), nchar(as.character(S_timeseries$Genotype)))
S_timeseries$Plant_type <- ifelse(S_timeseries$Plant_type %in% c("T", "L"), "Line",
ifelse(S_timeseries$Plant_type == "H", "Hybrid", S_timeseries$Plant_type))
timePoint_S <- createTimePoints(dat = S_timeseries,
experimentName = "EPPN2020_NaPPI",
genotype = "Genotype",
timePoint = "Date",
plotId = "Unit.ID",
rowNum = "Row",
colNum = "Column")
summary(timePoint_S)
## timePoint_S contains data for experiment EPPN2020_NaPPI.
##
## It contains 17 time points.
## First time point: 2021-06-13
## Last time point: 2021-06-29
##
## No check genotypes are defined.
getTimePoints(timePoint_S)
## timeNumber timePoint
## 1 1 2021-06-13
## 2 2 2021-06-14
## 3 3 2021-06-15
## 4 4 2021-06-16
## 5 5 2021-06-17
## 6 6 2021-06-18
## 7 7 2021-06-19
## 8 8 2021-06-20
## 9 9 2021-06-21
## 10 10 2021-06-22
## 11 11 2021-06-23
## 12 12 2021-06-24
## 13 13 2021-06-25
## 14 14 2021-06-26
## 15 15 2021-06-27
## 16 16 2021-06-28
## 17 17 2021-06-29
We choose the variables that we want to see. Count the number of observations per variable.
traits <- "S_Height_cm"
trait_name <- traits[1]
for (trait in traits) {
print(paste("How many observations for", trait))
valid_count <- countValid(timePoint_S, trait)
print(valid_count)
}
## [1] "How many observations for S_Height_cm"
## 2021-06-13 2021-06-14 2021-06-15 2021-06-16 2021-06-17 2021-06-18 2021-06-19
## 108 108 108 108 108 108 108
## 2021-06-20 2021-06-21 2021-06-22 2021-06-23 2021-06-24 2021-06-25 2021-06-26
## 108 108 108 108 108 108 108
## 2021-06-27 2021-06-28 2021-06-29
## 66 108 84
Check the genotypic layout at every timepoint.
getTimePoints(timePoint_S)
## timeNumber timePoint
## 1 1 2021-06-13
## 2 2 2021-06-14
## 3 3 2021-06-15
## 4 4 2021-06-16
## 5 5 2021-06-17
## 6 6 2021-06-18
## 7 7 2021-06-19
## 8 8 2021-06-20
## 9 9 2021-06-21
## 10 10 2021-06-22
## 11 11 2021-06-23
## 12 12 2021-06-24
## 13 13 2021-06-25
## 14 14 2021-06-26
## 15 15 2021-06-27
## 16 16 2021-06-28
## 17 17 2021-06-29
num_timepoints <- getTimePoints(timePoint_S)
for (i in 1:length(num_timepoints$timeNumber)) {
plot(timePoint_S,
plotType = "layout",
timePoints = i,
highlight = c("EPPN_T", "EPPN1_L", "EPPN1_H", "EPPN2_L", "EPPN2_H", "EPPN3_L", "EPPN3_H", "EPPN4_L", "EPPN4_H"))
}
Check the heatmap of the raw data at all the time points
for (trait in traits) {
for (tp in 1:length(num_timepoints$timeNumber)) {
plot(timePoint_S,
plotType = "layout",
timePoints = tp,
traits = trait)
}
}
Check some time courses of raw data
for (trait in traits) {
plot(timePoint_S,
traits = trait,
plotType = "raw")
}
for (trait in traits) {
plot(timePoint_S,
plotType = "box",
traits = trait)
}
for (trait in traits) {
plot(timePoint_S,
plotType = "cor",
traits = trait)
}
Using the SingleOut detect and single functions. We select a subset of plants to adjust the settings for the confIntSize and nnLocfit.
plantSel<- c(1,2,3,4,5,6,7,8,9,10)
cutoffA <- 1
ci <- c(3)[cutoffA] # confidence interval
nn <- c(0.6)[cutoffA] # nearest neighbour (0 - 1 greater number smoother curve)
ce <- c(FALSE)[cutoffA]
For all the traits
for (trait_name in traits) {
variable_name <- paste0("Single_test_", trait_name)
single_test <- detectSingleOut(
TP = timePoint_S,
trait = trait_name,
plotIds = plantSel,
confIntSize = ci,
nnLocfit = nn,
checkEdges = TRUE
)
assign(variable_name, single_test)
plot(single_test, outOnly = FALSE)
}
We can then run on all plants.
for (trait_name in traits) {
single_test_object_name <- paste0("Single_test_", trait_name)
Single_test <- get(single_test_object_name)
# Vérifier s'il y a des valeurs aberrantes
if (any(Single_test$outlier == 1)) {
# Compter le nombre de valeurs aberrantes par date
outliers_count <- with(Single_test[Single_test$outlier == 1,], table(timePoint))
print(trait_name)
print(outliers_count)
# Supprimer les valeurs aberrantes uniquement si elles existent
Single_outliers <- removeSingleOut(timePoint_S, Single_test)
assign(paste0("Single_outliers_", trait_name), Single_outliers)
# Écrire les résultats dans un fichier TSV
readr::write_tsv(Single_test, sprintf("%s/single_outliers_%s.tsv", datadir, trait_name))
} else {
cat("No outlier for", trait_name, "\n")
}
}
## [1] "S_Height_cm"
## timePoint
## 2021-06-15 2021-06-17 2021-06-22 2021-06-24 2021-06-27 2021-06-29
## 1 1 2 1 2 1
Check the heatmap of the data with outliers detection at all the time points.
for (trait_name in traits) {
single_outliers_name <- paste0("Single_outliers_", trait_name)
if (exists(single_outliers_name)) {
Single_outliers <- get(single_outliers_name)
for (tp in 1:length(num_timepoints$timeNumber)) {
plot(Single_outliers,
plotType = "layout",
timePoints = tp,
traits = trait_name)
}
} else {
cat("Aucun objet Single_outliers trouvé pour le trait", trait_name, "\n")
}
}
for (trait_name in traits) {
single_outliers_name <- paste0("Single_outliers_", trait_name)
if (exists(single_outliers_name)) {
Single_outliers <- get(single_outliers_name)
plot(Single_outliers,
traits = trait_name,
plotType = "raw")
plot(Single_outliers,
plotType = "box",
traits = trait_name)
plot(Single_outliers,
plotType = "cor",
traits = trait_name)
} else {
cat("No Single_outliers object found for trait", trait_name, "\n")
}
}
Fit a model for all time points with no extra fixed effects.
for (trait_name in traits) {
# Construire le nom de l'objet Single_outliers pour le trait actuel
single_outliers_name <- paste0("Single_outliers_", trait_name)
# Vérifier s'il existe un objet Single_outliers pour ce trait
if (exists(single_outliers_name)) {
# Récupérer l'objet Single_outliers correspondant
Single_outliers <- get(single_outliers_name)
# Créer le modèle modTP pour le trait actuel
assign(paste0("modTP_", trait_name),
fitModels(TP = Single_outliers,
trait = trait_name,
geno.decomp = "Plant_type"))
} else {
# Si aucun objet Single_outliers n'est trouvé, utiliser timePoint_S comme TP et le nom du trait
assign(paste0("modTP_", trait_name),
fitModels(TP = timePoint_S,
trait = trait_name,
geno.decomp = "Plant_type"))
}
}
## 2021-06-13
## 2021-06-14
## 2021-06-15
## 2021-06-16
## 2021-06-17
## 2021-06-18
## 2021-06-19
## 2021-06-20
## 2021-06-21
## 2021-06-22
## 2021-06-23
## 2021-06-24
## 2021-06-25
## 2021-06-26
## 2021-06-27
## 2021-06-28
## 2021-06-29
for (trait_name in traits) {
mod_name <- paste0("modTP_", trait_name)
if (exists(mod_name)) {
mod <- get(mod_name)
for (tp in 1:length(num_timepoints$timeNumber)) {
plot(mod,
timePoints = tp,
plotType = "spatial",
spaTrend = "percentage")
}
gif_file <- sprintf("%s/%s_mod.gif", datadir, trait_name)
plot(mod,
plotType = "timeLapse",
spaTrend = "percentage",
outFile = gif_file)
} else {
cat("Aucun modèle trouvé pour le trait", trait_name, "\n")
}
}
## Output at: C:/Users/elise/Documents/Mémoire/Template/UCPH/UCPH_Template/testtemplate/S_Height_cm_mod.gif
for (trait_name in traits) {
mod_name <- paste0("modTP_", trait_name)
if (exists(mod_name)) {
mod <- get(mod_name)
plot(mod,
plotType = "rawPred",
plotLine = TRUE)
plot(mod,
plotType = "corrPred",
plotLine = TRUE)
plot(mod,
plotType = "herit",
yLim = c(0, 0.5))
plot(mod,
plotType = "effDim",
EDType = "ratio",
yLim = c(0, 1))
# Tracer le graphique pour les valeurs corrigées spatialement
plot(mod,
plotType = "corrPred")
} else {
cat("Aucun modèle trouvé pour le trait", trait_name, "\n")
}
}
trait_name <- "S_Height_cm"
modTP <- fitModels(TP = Single_outliers_S_Height_cm,
trait = trait_name,
geno.decomp = "Plant_type")
## 2021-06-13
## 2021-06-14
## 2021-06-15
## 2021-06-16
## 2021-06-17
## 2021-06-18
## 2021-06-19
## 2021-06-20
## 2021-06-21
## 2021-06-22
## 2021-06-23
## 2021-06-24
## 2021-06-25
## 2021-06-26
## 2021-06-27
## 2021-06-28
## 2021-06-29
Spatial_Corrected <- getCorrected(modTP)
#Fit the spline for every plant
knots <- c(30)
mintimepoints <- c(9)
fit.spline <- fitSpline(inDat = Spatial_Corrected,
trait = paste0(trait_name, "_corr"),
knots = knots,
minNoTP = mintimepoints)
# Extracting the tables of predicted values and P-spline coefficients
predDat <- fit.spline$predDat
coefDat <- fit.spline$coefDat
thrCor <- c(0.9) # correlation threshold
thrPca <- c(30) # pca angle threshold
thrSlope <- c(0.7) # slope threshold
Series_test <- detectSerieOut(corrDat = Spatial_Corrected,
predDat = predDat,
coefDat = coefDat,
trait = paste0(trait_name, "_corr"),
thrCor = thrCor,
thrPca = thrPca,
thrSlope = thrSlope,
geno.decomp = "geno.decomp")
plot(Series_test, genotypes = levels(factor(Series_test$genotype)))
# Spatial_Corrected_Out <- Spatial_Corrected
Spatial_Corrected_Out <- removeSerieOut(dat = Spatial_Corrected,
serieOut = Series_test)
readr::write_tsv(Spatial_Corrected_Out, sprintf( "%s/timeSeriesOutliers_%s.tsv", datadir, trait_name))